In [117]:
import praw
from tqdm import tqdm
import re
from collections import Counter
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
import numpy as np
from praw.models.reddit.comment import Comment 
In [118]:
import os

# Reddit API credentials are read from the environment so they never end up
# committed in the notebook (the original cell had empty hardcoded slots).
# Set REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET before running; the empty-string
# fallback preserves the original placeholder behavior.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    user_agent="Comment Extraction",
)
In [ ]:
def getAllCommentsFromHotPosts(sub, postlimit=20, commentlimit=5, fullComment=False):
    """Collect comments from the hot posts of a subreddit.

    Parameters
    ----------
    sub : str
        Subreddit name (without the /r/ prefix), e.g. "wallstreetbets".
    postlimit : int
        Number of hot submissions to scan.
    commentlimit : int
        Passed to ``replace_more(limit=...)``: caps how many "MoreComments"
        stubs are expanded per submission — NOT a cap on comment count.
    fullComment : bool
        If True return praw Comment objects; otherwise just the body strings.

    Returns
    -------
    list
        A flat list over all scanned submissions.
    """
    subreddit = reddit.subreddit(sub)
    out = []
    # Materialize first so tqdm knows the total; iterate directly instead of
    # indexing through range(len(...)).
    for submission in tqdm(list(subreddit.hot(limit=postlimit))):
        submission.comments.replace_more(limit=commentlimit)
        for comment in submission.comments.list():
            out.append(comment if fullComment else comment.body)
    return out
In [120]:
# Pull full Comment objects (not just text) from the 20 hottest WSB posts;
# fullComment=True so the bodies can be cleaned in place further down.
wsbComments = getAllCommentsFromHotPosts("wallstreetbets",fullComment=True)
len(wsbComments)
100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:44<00:00,  2.20s/it]
Out[120]:
6876
In [121]:
def getAllBodies(comments):
    """Return the body text of each comment, preserving order."""
    return [comment.body for comment in comments]

def getAllTimes(comments):
    """Return each comment's created_utc timestamp, preserving order."""
    return [comment.created_utc for comment in comments]

def setAllBodies(comments, newBodies):
    """Overwrite each comment's body in place with the paired replacement.

    Empty replacement strings are skipped, so a comment whose cleaned text
    came out blank keeps its original body. Lengths must match.
    """
    assert len(comments) == len(newBodies)
    for comment, replacement in zip(comments, newBodies):
        if replacement:
            comment.body = replacement

def filterEmptyBodies(comments):
    """Return only the comments whose body is non-empty."""
    return [comment for comment in comments if comment.body]
In [122]:
def preprocess(data, skipEmpty=True):
    """Clean raw comment strings for downstream keyword matching.

    Steps, in order: blank out moderator-report / removed / deleted
    comments, strip URLs and inline images, drop square brackets, flatten
    newlines, collapse runs of spaces, trim, and lowercase.

    Parameters
    ----------
    data : iterable of str
        Raw comment bodies.
    skipEmpty : bool
        If True, comments that clean down to "" are dropped; if False they
        are kept as "" so the output aligns 1:1 with the input.

    Returns
    -------
    list of str
        Cleaned comment strings.
    """
    newdata = []
    for text in data:
        # Moderator report blobs and removed/deleted placeholders carry no signal.
        if re.search(r'\*\*User Report\*\*|\[removed\]|\[deleted\]', text):
            text = ""

        # Remove URLs and reddit inline-image markup (replaced with a space
        # so neighboring words don't fuse together).
        text = re.sub(r'http\S+', ' ', text)
        text = re.sub(r'!\[img\]\S+', ' ', text)
        # Remove markdown link/quote brackets.
        text = re.sub(r'\[|\]', ' ', text)
        # Flatten newlines into spaces.
        text = text.replace('\n', ' ')
        # Collapse any run of 2+ spaces. (The previous single
        # replace('  ', ' ') pass left residue for runs of 3+ spaces,
        # which the substitutions above routinely create.)
        text = re.sub(r' {2,}', ' ', text)
        text = text.strip()
        text = text.lower()

        if len(text) == 0 and skipEmpty:
            continue

        newdata.append(text)

    return newdata
In [123]:
# Clean all WSB comment bodies and write the cleaned text back onto the
# praw Comment objects. skipEmpty=False keeps the cleaned list aligned 1:1
# with wsbComments; setAllBodies skips empty replacements.
rawText = getAllBodies(wsbComments)
procText = preprocess(rawText,skipEmpty=False)
setAllBodies(wsbComments,procText)
In [124]:
# Load the company_tickers.json mapping (ticker symbol -> company title)
# and keep the first 1500 rows. NOTE(review): presumably the file is ordered
# roughly by market cap so this keeps the largest companies — confirm
# against the source file.
tickers = pd.read_json('company_tickers.json', orient='index')[["ticker","title"]].iloc[:1500,:]
In [125]:
# Normalize tickers/titles for matching against the lowercased comments.
tickers["ticker"] = tickers["ticker"].str.lower()
tickers["title"] = tickers["title"].str.lower()
# First word of the company title with commas/periods stripped
# ("apple, inc." -> "apple"). regex=False is essential: under the older
# pandas default (regex=True) the '.' pattern matches EVERY character and
# would blank the whole string.
tickers["title_first"] = (
    tickers["title"].str.split().str.get(0)
    .str.replace(',', '', regex=False)
    .str.replace('.', '', regex=False)
)
In [126]:
# Background corpus: comments from /r/all hot posts. Used below (wordFreqs)
# to estimate baseline word frequencies so that ticker keywords which are
# also common English words can be filtered out.
raw_all = getAllCommentsFromHotPosts("all",postlimit=40)
data_all = preprocess(raw_all)
len(data_all)
100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [01:16<00:00,  1.91s/it]
Out[126]:
12341
In [127]:
def wordFreqs(data):
    """Log relative frequency of every word across a list of comments.

    Splits each comment on single spaces, strips '.', '!' and '?' from each
    token, and returns a Counter mapping word -> log(count / total_words),
    where total_words counts tokens before punctuation stripping. Missing
    words look up as 0 (Counter default).
    """
    counts = Counter()
    total = 0
    for comment in data:
        tokens = comment.split(' ')
        total += len(tokens)
        counts.update(
            t.replace('.', '').replace('!', '').replace('?', '') for t in tokens
        )
    # Convert raw counts to log relative frequencies in place.
    for word in counts:
        counts[word] = np.log(counts[word]) - np.log(total)
    return counts

# Baseline log-frequencies from the /r/all background corpus.
backg_freqs = wordFreqs(data_all)
In [128]:
# Build the keyword -> ticker lookup. A ticker symbol (or the first word of
# the company title) is only usable as a keyword if it is rare in ordinary
# Reddit chatter: Counter lookup returns 0 for words never seen in the
# background corpus, and any seen word must fall below the log-frequency
# threshold. This filters out tickers that double as common words.
keywordToTicker = {}
for idx,row in tickers.iterrows():
    ticker = row["ticker"]
    title = row["title_first"]
    thresh = -12.1  # log relative-frequency cutoff (~e^-12.1); presumably hand-tuned
    if backg_freqs[ticker]==0 or backg_freqs[ticker] < thresh:
        keywordToTicker[ticker] = ticker
    if backg_freqs[title]==0 or backg_freqs[title] < thresh:
        keywordToTicker[title] = ticker
In [129]:
def bucketComments(comments, keyDict):
    """Group comments by the ticker(s) their text mentions.

    A comment joins a ticker's bucket at most once, even when several
    keywords map to the same ticker or a keyword repeats in the comment.

    Parameters
    ----------
    comments : iterable of objects exposing a .body string.
    keyDict : dict mapping keyword -> ticker symbol.

    Returns
    -------
    dict mapping ticker -> list of matching comments.
    """
    buckets = {}
    for comment in comments:
        matched = set()
        for token in comment.body.split(' '):
            ticker = keyDict.get(token)
            if ticker is not None and ticker not in matched:
                buckets.setdefault(ticker, []).append(comment)
                matched.add(ticker)
    return buckets
# Bucket the (already preprocessed) WSB comments by mentioned ticker.
buckets = bucketComments(wsbComments,keywordToTicker)
In [130]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

def scoreTexts(texts, model, tokenizer, config):
    """Run sentiment classification over a batch of texts.

    Parameters
    ----------
    texts : list of str
        Comment bodies, tokenized together as one padded/truncated batch.
    model : sequence-classification model (PyTorch) whose first output is logits.
    tokenizer, config : matching tokenizer and config; config.id2label maps
        class index -> label string.

    Returns
    -------
    scores : (len(texts), n_classes) ndarray of softmax probabilities.
    labs : list of predicted label strings, one per text.
    """
    encoded_input = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    output = model(**encoded_input)
    # output[0] is the logits tensor; softmax over the class axis.
    scores = softmax(output[0].detach().numpy(), axis=1)
    # argmax is clearer (and cheaper) than a full argsort for the top class.
    maxids = np.argmax(scores, axis=1)
    labs = [config.id2label[i] for i in maxids]
    return scores, labs
In [131]:
def analyzeBuckets(buckets):
    """Score every bucketed comment with a sentiment model.

    Loads the cardiffnlp twitter-roberta sentiment model (downloads on
    first use), scores each ticker's comments, and assembles a single
    DataFrame with per-class probabilities, the argmax sentiment label,
    the comment timestamp, and a z-scored positivity column
    (positive + 0.5 * neutral, standardized over all comments).

    Parameters
    ----------
    buckets : dict mapping ticker -> list of praw Comment objects.

    Returns
    -------
    pd.DataFrame with columns: comment, created, ticker, negative,
    neutral, positive, sentiment, pos_norm.
    """
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    frames = []
    for ticker, comments in buckets.items():
        bodies = getAllBodies(comments)
        scores, labels = scoreTexts(bodies, model, tokenizer, config)
        frame = pd.DataFrame(scores, columns=["negative", "neutral", "positive"])
        frame["sentiment"] = labels
        frame["ticker"] = ticker
        frame["comment"] = bodies
        frame["created"] = pd.to_datetime(getAllTimes(comments), unit='s')
        frames.append(frame)

    column_order = ["comment", "created", "ticker", "negative", "neutral", "positive", "sentiment"]
    out = pd.concat(frames)[column_order].reset_index(drop=True)
    # Positivity score: full weight for "positive", half credit for
    # "neutral", then z-scored so 0 marks the average comment.
    pos_adj = out['positive'] + (0.5 * out['neutral'])
    out['pos_norm'] = (pos_adj - np.mean(pos_adj)) / np.std(pos_adj)
    return out
        
# Score all bucketed WSB comments (downloads the model on first run).
analyzed = analyzeBuckets(buckets)
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
In [132]:
analyzed
Out[132]:
comment created ticker negative neutral positive sentiment pos_norm
0 the year is 2100...we have ai powered robots e... 2024-12-06 16:14:41 sbux 0.573358 0.385036 0.041605 negative -0.656183
1 a starbucks grande mocha costs $205,000 trump ... 2024-12-06 21:35:26 sbux 0.279288 0.629626 0.091086 neutral -0.012359
2 i did that with my last $50 left in my account... 2024-12-09 21:38:22 sbux 0.142672 0.389270 0.468058 positive 0.950121
3 my bad. this is a great write up. do you have ... 2024-12-09 19:36:56 sbux 0.652351 0.198814 0.148835 negative -0.603266
4 why is sbux a bad company? they seem great, an... 2024-12-09 23:00:38 sbux 0.119699 0.235666 0.644635 positive 1.324084
... ... ... ... ... ... ... ... ...
1158 >!i am so poor, i switched from at&t postpaid ... 2024-12-09 22:55:47 t 0.902450 0.085595 0.011955 negative -1.328475
1159 i'm all in on tsla and rbrk right now. it's a ... 2024-12-09 21:12:10 rbrk 0.003167 0.023927 0.972906 positive 2.157657
1160 shit i’m full port alab till 250b market cap 2024-12-09 22:46:22 alab 0.809970 0.168262 0.021768 negative -1.136777
1161 4bagger on khc calls today. it’s been a minute... 2024-12-09 22:52:29 khc 0.067815 0.842997 0.089187 neutral 0.380389
1162 coinbase down and won't let me sell, puts on coin 2024-12-10 03:33:29 coin 0.819851 0.170869 0.009280 negative -1.178695

1163 rows × 8 columns

In [142]:
import plotly
import plotly.express as px
plotly.offline.init_notebook_mode()

# Per-ticker summary: comment count (n) and mean z-scored positivity.
pos_norm_summary = analyzed.groupby('ticker').agg(
    n=('pos_norm','count'),  
    avg_pos_norm_score=('pos_norm', 'mean')
).reset_index()

# Treemap: tile size = number of comments, color = average sentiment
# (red negative, white neutral, green positive). Midpoint is pinned at 0
# because pos_norm is standardized around 0.
fig = px.treemap(
    pos_norm_summary,
    path=['ticker'],
    values='n',
    color='avg_pos_norm_score',
    color_continuous_scale=["red","white","green"],
    color_continuous_midpoint=0,
    title="Normalized sentiment of stocks from /r/wallstreetbets comments"
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()
In [140]:
# Positive-only view: among comments labeled "positive", the per-ticker
# count and mean positive-class probability.
pos_summary = analyzed[analyzed['sentiment']=='positive'].groupby('ticker').agg(
    n_positive=('positive','count'),  
    avg_pos_score=('positive', 'mean')
).reset_index()
tot_pos = np.sum(analyzed['sentiment']=='positive')
# Tile size = positive-comment count, color = average positive probability.
fig = px.treemap(
    pos_summary,
    path=['ticker'],
    values='n_positive',
    color='avg_pos_score',
    color_continuous_scale='Greens',
    title=f'Total positive comments = {tot_pos}'
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()
In [141]:
# Negative-only view: among comments labeled "negative", the per-ticker
# count and mean negative-class probability (mirrors the positive view).
neg_summary = analyzed[analyzed['sentiment']=='negative'].groupby('ticker').agg(
    n_negative=('negative','count'),  
    avg_neg_score=('negative', 'mean')
).reset_index()
tot_neg = np.sum(analyzed['sentiment']=='negative')
# Tile size = negative-comment count, color = average negative probability.
fig = px.treemap(
    neg_summary,
    path=['ticker'],
    values='n_negative',
    color='avg_neg_score',
    color_continuous_scale='Reds',
    title=f'Total negative comments = {tot_neg}'
)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()
In [ ]: